{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Example PyTorch code for what the paper's model might look like.\n",
    "A quick tutorial on PyTorch can be found at https://cs230-stanford.github.io/pytorch-getting-started.html\n",
    "The official PyTorch introduction, which goes into a few more details, is at\n",
    "https://pytorch.org/tutorials/beginner/deep_learning_60min_blitz.html\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import torch\n",
    "import torch.nn as nn\n",
    "import torch.nn.functional as F\n",
    "import torch.optim as optim\n",
    "\n",
    "from torch.utils.data import Dataset\n",
    "from torch.utils.data import DataLoader\n",
    "\n",
    "\n",
    "torch.manual_seed(1)\n",
    "\n",
    "import numpy as np\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "class PathLSTM(nn.Module):\n",
    "    '''\n",
    "    Modified LSTM model for classifying paths.\n",
    "\n",
    "    Each step of a path is an (entity, type, relation) id triple. The three ids are\n",
    "    embedded separately, concatenated, and fed to PyTorch's default LSTM layer. The\n",
    "    final hidden state of every path then goes through 2 linear layers.\n",
    "    '''\n",
    "\n",
    "    def __init__(self, e_emb_dim, t_emb_dim, r_emb_dim, hidden_dim, e_vocab_size,\n",
    "                 t_vocab_size, r_vocab_size, tagset_size):\n",
    "        super().__init__()\n",
    "        self.hidden_dim = hidden_dim\n",
    "\n",
    "        # One embedding table per field of a path step.\n",
    "        self.entity_embeddings = nn.Embedding(e_vocab_size, e_emb_dim)\n",
    "        self.type_embeddings = nn.Embedding(t_vocab_size, t_emb_dim)\n",
    "        self.rel_embeddings = nn.Embedding(r_vocab_size, r_emb_dim)\n",
    "\n",
    "        # The LSTM consumes the concatenated step embedding and produces hidden\n",
    "        # states of size hidden_dim.\n",
    "        step_dim = e_emb_dim + t_emb_dim + r_emb_dim\n",
    "        self.lstm = nn.LSTM(step_dim, hidden_dim)\n",
    "\n",
    "        # Two linear layers map the final hidden state to tag scores.\n",
    "        self.linear1 = nn.Linear(hidden_dim, hidden_dim)\n",
    "        self.linear2 = nn.Linear(hidden_dim, tagset_size)\n",
    "\n",
    "    def forward(self, paths, path_lengths):\n",
    "        '''\n",
    "        Return one row of tag log-probabilities per path.\n",
    "\n",
    "        paths is indexed as (batch, step, field), where field 0 is the entity id,\n",
    "        field 1 the type id and field 2 the relation id. path_lengths holds the\n",
    "        unpadded length of each path; the sequences are packed with PyTorch's default\n",
    "        settings, so the batch is expected to be sorted longest first.\n",
    "        '''\n",
    "        # Embed each field on its own. Padding steps get embedded too, but packing\n",
    "        # below keeps them out of the LSTM computation.\n",
    "        entity_embed = self.entity_embeddings(paths[:, :, 0])\n",
    "        type_embed = self.type_embeddings(paths[:, :, 1])\n",
    "        rel_embed = self.rel_embeddings(paths[:, :, 2])\n",
    "\n",
    "        # Concatenate along the feature axis: (batch, step, features).\n",
    "        step_features = torch.cat((entity_embed, type_embed, rel_embed), dim=2)\n",
    "\n",
    "        # Pack the batch-major tensor so no computation is spent on padding.\n",
    "        packed_steps = nn.utils.rnn.pack_padded_sequence(\n",
    "            step_features, path_lengths, batch_first=True)\n",
    "\n",
    "        # The final hidden state is the output at each path's last real step,\n",
    "        # which is the only output we need.\n",
    "        _, (final_hidden, _) = self.lstm(packed_steps)\n",
    "\n",
    "        # Pass through the linear layers.\n",
    "        hidden = F.relu(self.linear1(final_hidden[-1]))\n",
    "        tag_scores = self.linear2(hidden)\n",
    "\n",
    "        # The paper uses ReLU as the final activation; log_softmax is used here\n",
    "        # because nn.NLLLoss expects log-probabilities as input.\n",
    "        return F.log_softmax(tag_scores, dim=1)\n",
    "      "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Hand-built toy vocabularies for now; later these maps should be created\n",
    "# automatically from the vocab. Ids follow list order, with '#PAD' last.\n",
    "e_to_ix = {name: ix for ix, name in enumerate(\n",
    "    ['Sam', 'Weijia', 'Rosa', 'Joey', 'Song1', 'Song2', 'Song3', 'Pop', '#PAD'])}\n",
    "t_to_ix = {name: ix for ix, name in enumerate(['u', 's', 't', '#PAD'])}\n",
    "r_to_ix = {name: ix for ix, name in enumerate(\n",
    "    ['rate', 'category', 'belong', '_rate', '_category', '_belong', 'UNK', '#PAD'])}\n",
    "\n",
    "# Each example is (path, tag), where a path is a list of [entity, type, relation] steps.\n",
    "# The paths could instead be stored transposed, as\n",
    "# [[entity1, entity2, ...], [type1, type2, ...], [rel1, rel2, ...]].\n",
    "training_data = [\n",
    "    (\n",
    "        [['Sam', 'u', 'rate'], ['Song1', 's', 'category'], ['Pop', 't', '_belong'], ['Song2', 's', 'UNK']],\n",
    "        1,\n",
    "    ),\n",
    "    (\n",
    "        [['Sam', 'u', 'rate'], ['Song2', 's', 'UNK']],\n",
    "        1,\n",
    "    ),\n",
    "    (\n",
    "        [['Sam', 'u', 'rate'], ['Song1', 's', '_rate'], ['Joey', 'u', 'rate'], ['Song3', 's', 'UNK']],\n",
    "        0,\n",
    "    ),\n",
    "    (\n",
    "        [['Sam', 'u', 'rate'], ['Song1', 's', '_rate'], ['Song3', 's', 'UNK']],\n",
    "        0,\n",
    "    ),\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def find_max_length(data):\n",
    "    '''Return the length of the longest path among the (path, tag) pairs in data, or 0 if data is empty.'''\n",
    "    return max((len(path) for path, _ in data), default=0)\n",
    "\n",
    "def prepare_path(seq, e_to_ix, t_to_ix, r_to_ix, max_len):\n",
    "    '''\n",
    "    Convert a path of [entity, type, relation] names into a LongTensor of ids.\n",
    "\n",
    "    Paths shorter than max_len are extended with rows of '#PAD' ids; longer paths\n",
    "    are returned at their full length.\n",
    "    '''\n",
    "    relation_pad, type_pad, entity_pad = r_to_ix['#PAD'], t_to_ix['#PAD'], e_to_ix['#PAD']\n",
    "\n",
    "    # one [entity id, type id, relation id] row per step\n",
    "    id_rows = [[e_to_ix[step[0]], t_to_ix[step[1]], r_to_ix[step[2]]] for step in seq]\n",
    "\n",
    "    # range() of a non-positive count is empty, so long paths get no padding\n",
    "    missing = max_len - len(id_rows)\n",
    "    id_rows.extend([entity_pad, type_pad, relation_pad] for _ in range(missing))\n",
    "\n",
    "    return torch.tensor(id_rows, dtype=torch.long)\n",
    "\n",
    "max_len = find_max_length(training_data)\n",
    "\n",
    "# Each entry is a tuple of (padded path tensor, tag, original path length).\n",
    "formatted_data = [\n",
    "    (prepare_path(path, e_to_ix, t_to_ix, r_to_ix, max_len), tag, len(path))\n",
    "    for path, tag in training_data\n",
    "]\n",
    "print(formatted_data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def sort_batch(batch, targets, lengths):\n",
    "    '''\n",
    "    Reorder a batch of paths by path length, in decreasing order.\n",
    "\n",
    "    Returns (paths, targets, lengths), all permuted by the same descending sort of lengths.\n",
    "    '''\n",
    "    sorted_lengths, order = lengths.sort(dim=0, descending=True)\n",
    "    return batch[order], targets[order], sorted_lengths"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Toy-sized dimensions; the values used in the paper are noted alongside.\n",
    "E_EMBEDDING_DIM = 3  # 64 in paper\n",
    "T_EMBEDDING_DIM = 3  # 32 in paper\n",
    "R_EMBEDDING_DIM = 3  # 32 in paper\n",
    "HIDDEN_DIM = 6  # this might be the paper's unit number = 256\n",
    "TARGET_SIZE = 2\n",
    "\n",
    "model = PathLSTM(\n",
    "    e_emb_dim=E_EMBEDDING_DIM,\n",
    "    t_emb_dim=T_EMBEDDING_DIM,\n",
    "    r_emb_dim=R_EMBEDDING_DIM,\n",
    "    hidden_dim=HIDDEN_DIM,\n",
    "    e_vocab_size=len(e_to_ix),\n",
    "    t_vocab_size=len(t_to_ix),\n",
    "    r_vocab_size=len(r_to_ix),\n",
    "    tagset_size=TARGET_SIZE,\n",
    ")\n",
    "\n",
    "# Negative log likelihood loss, applied to the model's log_softmax output.\n",
    "# nn.CrossEntropyLoss would also work on un-normalized scores (e.g. with a relu\n",
    "# activation), because it applies the softmax normalization itself.\n",
    "loss_function = nn.NLLLoss()\n",
    "\n",
    "# The paper tunes l2 regularization over {1e-5, 1e-4, 1e-3, 1e-2} (I think this is\n",
    "# weight decay) and finds the learning rate from {0.001, 0.002, 0.01, 0.02} with\n",
    "# grid search.\n",
    "optimizer = optim.Adam(model.parameters(), lr=0.01, weight_decay=1e-3)\n",
    "\n",
    "# DataLoader splits the formatted data into batches.\n",
    "train_loader = DataLoader(dataset=formatted_data, batch_size=3, shuffle=False)\n",
    "\n",
    "for epoch in range(300):  # tiny data, so 300 epochs\n",
    "    for path_batch, targets, lengths in train_loader:\n",
    "\n",
    "        # Packing requires the longest path first, so sort the batch by length.\n",
    "        s_path_batch, s_targets, s_lengths = sort_batch(path_batch, targets, lengths)\n",
    "\n",
    "        # PyTorch accumulates gradients, so clear them before each batch.\n",
    "        model.zero_grad()\n",
    "\n",
    "        # Forward pass.\n",
    "        tag_scores = model(s_path_batch, s_lengths)\n",
    "\n",
    "        # Loss, backward pass, and parameter update.\n",
    "        loss = loss_function(tag_scores, s_targets)\n",
    "        loss.backward()\n",
    "        optimizer.step()\n",
    "\n",
    "        # Report the batch loss every 30th epoch.\n",
    "        if epoch % 30 == 0:\n",
    "            print(\"loss is:\", loss.item())\n",
    "        "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def convert_to_etr(e_to_ix, t_to_ix, r_to_ix, path, length):\n",
    "    '''\n",
    "    Translate the first `length` steps of an id path back into [entity, type, relation] names.\n",
    "\n",
    "    Every step of path must support step[k].item() for k in 0..2 (e.g. a row of a tensor).\n",
    "    '''\n",
    "    # invert the name -> id maps, in (entity, type, relation) column order\n",
    "    ix_to_name = [\n",
    "        {ix: name for name, ix in vocab.items()}\n",
    "        for vocab in (e_to_ix, t_to_ix, r_to_ix)\n",
    "    ]\n",
    "\n",
    "    named_steps = []\n",
    "    for position, step in enumerate(path):\n",
    "        # everything from index `length` onwards is padding\n",
    "        if position == length:\n",
    "            break\n",
    "        named_steps.append([lookup[step[col].item()] for col, lookup in enumerate(ix_to_name)])\n",
    "    return named_steps"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# See what the scores are after training, on the training dataset.\n",
    "# Each batch has to be sorted by length before the forward pass, so the paths are\n",
    "# printed in that sorted order rather than in their original dataset order.\n",
    "# TODO: need a way to map from the original path to its result.\n",
    "# The model ends in log_softmax, so each printed row holds log-probabilities\n",
    "# (values <= 0, higher means more likely), not losses.\n",
    "print(\"paths and scores of form: [log-probability of tag 0, log-probability of tag 1]:\")\n",
    "print()\n",
    "with torch.no_grad():  # inference only, no gradients needed\n",
    "    test_loader = DataLoader(dataset=formatted_data, batch_size=4, shuffle=False)\n",
    "    for path_batch, target_batch, lengths in test_loader:\n",
    "        s_path_batch, s_targets, s_lengths = sort_batch(path_batch, target_batch, lengths)\n",
    "        tag_scores = model(s_path_batch, s_lengths)\n",
    "\n",
    "        # print each un-padded path next to its row of scores\n",
    "        for i, ix_path in enumerate(s_path_batch):\n",
    "            print(convert_to_etr(e_to_ix, t_to_ix, r_to_ix, ix_path, s_lengths[i]))\n",
    "            print(tag_scores[i])\n",
    "            \n",
    "            "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
